load data
# Load the Credit data set: 400 customers, 12 columns (see dim() output below),
# including the Balance response and the Income column modelled later.
credit <- read.csv('Credit.csv')
dim(credit)
## [1] 400 12
head(credit)
## ID Income Limit Rating Cards Age Education Gender Student Married Ethnicity
## 1 1 14.891 3606 283 2 34 11 Male No Yes Caucasian
## 2 2 106.025 6645 483 3 82 15 Female Yes Yes Asian
## 3 3 104.593 7075 514 4 71 11 Male No No Asian
## 4 4 148.924 9504 681 3 36 11 Female No No Asian
## 5 5 55.882 4897 357 2 68 16 Male No Yes Caucasian
## 6 6 80.180 8047 569 4 77 10 Male No No Caucasian
## Balance
## 1 333
## 2 903
## 3 580
## 4 964
## 5 331
## 6 1151
str(credit)
## 'data.frame': 400 obs. of 12 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Income : num 14.9 106 104.6 148.9 55.9 ...
## $ Limit : int 3606 6645 7075 9504 4897 8047 3388 7114 3300 6819 ...
## $ Rating : int 283 483 514 681 357 569 259 512 266 491 ...
## $ Cards : int 2 3 4 3 2 4 2 2 5 3 ...
## $ Age : int 34 82 71 36 68 77 37 87 66 41 ...
## $ Education: int 11 15 11 11 16 10 12 9 13 19 ...
## $ Gender : chr " Male" "Female" " Male" "Female" ...
## $ Student : chr "No" "Yes" "No" "No" ...
## $ Married : chr "Yes" "Yes" "No" "No" ...
## $ Ethnicity: chr "Caucasian" "Asian" "Asian" "Asian" ...
## $ Balance : int 333 903 580 964 331 1151 203 872 279 1350 ...
# Drop the ID column: it is a row identifier with no predictive value.
credit <- credit[, -1]
# Split the data into a 70% training set and a 30% test set.
# NOTE(review): there is no set.seed() before sample(), so the split -- and
# every model result below -- is not reproducible run-to-run; consider adding one.
selected <- sample(seq_len(nrow(credit)), size = round(nrow(credit) * 0.7))
train <- credit[selected, ]  # randomly selected 70% of rows (280 obs) for the training set
test <- credit[-selected, ]  # remaining 30% of rows (120 obs) for the test set
SVM regression
- Support Vector Machines (SVMs) are supervised learning algorithms that work for both classification and regression problems.
- The main objective of an SVM is to find the optimum hyperplane (a line in 2D, a plane in 3D) that maximises the margin (twice the distance between the closest data point and the hyperplane) between two classes.
Fit SVMs with different kernels and assess the accuracy of the prediction from 10-fold cross-validation.
#install.packages('e1071')
library(e1071)
#The "kernel" argument is set to "linear" to use a linear kernel, which is appropriate when the data is linearly separable.
#The "cross" argument is set to 10 to perform 10-fold cross-validation, which helps to estimate the performance of the model on new, unseen data.
# Fit an eps-regression SVM predicting Income from all other columns.
# NOTE(review): the warning below is raised inside e1071's CV scaling code,
# not by these arguments; it appears harmless but verify after upgrading e1071.
fitsvml <- svm(Income~., kernel="linear", cross=10, data=train)
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
summary(fitsvml)
##
## Call:
## svm(formula = Income ~ ., data = train, kernel = "linear", cross = 10)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: linear
## cost: 1
## gamma: 0.08333333
## epsilon: 0.1
##
##
## Number of Support Vectors: 140
##
##
##
## 10-fold cross-validation on training data:
##
## Total Mean Squared Error: 152.3945
## Squared Correlation Coefficient: 0.9017399
## Mean Squared Errors:
## 163.2028 187.1172 130.7224 148.904 200.2687 235.632 55.28169 82.69432 103.083 217.0386
# Evaluate the linear-kernel SVR model on the held-out test set.
# generate predictions using SVM model
pred <- predict(fitsvml, newdata = test)
# NOTE(review): the previous version built a confusion matrix with
# table(test$Income, pred) and reported sum(diag(...))/sum(...) as "accuracy".
# Income is a continuous response, so tabulating its near-unique values is
# meaningless for regression; that metric has been removed. RMSE is the
# appropriate test-set error measure here.
rmse <- sqrt(mean((test$Income - pred)^2))
cat("RMSE: ", round(rmse, 3))
## RMSE: 12.28
kernel = "polynomial"
#The "kernel= "polynomial" can capture non-linear relationships in the data
# Fit a polynomial-kernel SVR model (default degree 3, per the summary below)
# with 10-fold cross-validation.
fitsvmp <- svm(Income~., kernel="polynomial", cross=10, data=train)
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
summary(fitsvmp)
##
## Call:
## svm(formula = Income ~ ., data = train, kernel = "polynomial", cross = 10)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: polynomial
## cost: 1
## degree: 3
## gamma: 0.08333333
## coef.0: 0
## epsilon: 0.1
##
##
## Number of Support Vectors: 233
##
##
##
## 10-fold cross-validation on training data:
##
## Total Mean Squared Error: 443.0724
## Squared Correlation Coefficient: 0.6641806
## Mean Squared Errors:
## 450.6859 139.7488 322.2924 626.7203 177.7441 574.7849 539.273 392.1623 570.5982 636.7145
# Evaluate the polynomial-kernel SVR model on the held-out test set.
# generate predictions using SVM model
pred <- predict(fitsvmp, newdata = test)
# NOTE(review): the confusion-matrix "accuracy" previously computed here has
# been removed -- Income is continuous, so a confusion matrix is not a valid
# metric for this regression; RMSE is reported instead.
rmse <- sqrt(mean((test$Income - pred)^2))
cat("RMSE: ", round(rmse, 3))
## RMSE: 29.857
kernel = "radial"
#The kernel= "radial" to use an RBF kernel, which is a popular kernel function in SVMs that can capture non-linear relationships in the data.
# Fit an RBF-kernel SVR model with 10-fold cross-validation.
fitsvmr <- svm(Income~., kernel="radial", cross=10, data=train)
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
summary(fitsvmr)
##
## Call:
## svm(formula = Income ~ ., data = train, kernel = "radial", cross = 10)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: radial
## cost: 1
## gamma: 0.08333333
## epsilon: 0.1
##
##
## Number of Support Vectors: 177
##
##
##
## 10-fold cross-validation on training data:
##
## Total Mean Squared Error: 117.4282
## Squared Correlation Coefficient: 0.9239854
## Mean Squared Errors:
## 93.27741 66.19795 163.1243 78.06308 160.4165 163.9439 67.66245 88.33344 75.64436 217.6183
# generate predictions using SVM model for kernel="radial"
predvmr <- predict(fitsvmr, newdata = test)
# BUG FIX: the confusion matrix was previously built from `pred` -- the
# polynomial model's predictions left over from the chunk above -- rather than
# `predvmr`, so the reported "accuracy" described the wrong model. The metric
# is dropped entirely: accuracy via a confusion matrix is invalid for a
# continuous response. The RMSE below already used `predvmr` and is unchanged.
rmse <- sqrt(mean((test$Income - predvmr)^2))
cat("RMSE: ", round(rmse, 3))
## RMSE: 10.897
#predsvmr <- predict(fitsvmr, test)
#sqrt(sum((test$Income-predsvmr)^2))
#predsvms <- predict(fitsvms, test)
#sqrt(sum((test$Income-predsvms)^2))
kernel = "sigmoid"
library(ggplot2)
# NOTE(review): ggplot2 is not used by the svm() fit below; presumably loaded
# for plotting later -- consider moving this library() call to where it is used.
# Fit a sigmoid-kernel SVR model with 10-fold cross-validation.
fitsvms <- svm(Income~., kernel="sigmoid", cross=10, data=train)
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
summary(fitsvms)
##
## Call:
## svm(formula = Income ~ ., data = train, kernel = "sigmoid", cross = 10)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: sigmoid
## cost: 1
## gamma: 0.08333333
## coef.0: 0
## epsilon: 0.1
##
##
## Number of Support Vectors: 256
##
##
##
## 10-fold cross-validation on training data:
##
## Total Mean Squared Error: 4213.12
## Squared Correlation Coefficient: 0.04955032
## Mean Squared Errors:
## 5617.584 805.201 2713.527 9535.845 582.8176 738.4147 9503.763 2362.052 1771.648 8500.353
# generate predictions using SVM model for kernel="sigmoid"
predvms <- predict(fitsvms, newdata = test)
# BUG FIX: the RMSE was previously computed from `predvmr` (the RADIAL model's
# predictions), so the sigmoid model's reported RMSE (10.897) was actually the
# radial model's value. It now uses `predvms`. The confusion-matrix "accuracy"
# is dropped -- it is not a valid metric for a continuous response.
rmse <- sqrt(mean((test$Income - predvms)^2))
cat("RMSE: ", round(rmse, 3))
Major advantages of using SVMs are that:
they work well with a large number of predictors, handle non-linearly separable data, perform well on tasks such as image classification, and do not suffer from the multicollinearity problem.
load data
# Split the iris data into a 70% training / 30% test partition.
dim(iris)
## [1] 150 5
n_obs <- nrow(iris)
# Draw 70% of the row indices at random (no seed is set, so the split varies per run).
selected <- sample(seq_len(n_obs), size = round(n_obs * 0.7))
train <- iris[selected, ]
train
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 14 4.3 3.0 1.1 0.1 setosa
## 80 5.7 2.6 3.5 1.0 versicolor
## 122 5.6 2.8 4.9 2.0 virginica
## 140 6.9 3.1 5.4 2.1 virginica
## 26 5.0 3.0 1.6 0.2 setosa
## 141 6.7 3.1 5.6 2.4 virginica
## 31 4.8 3.1 1.6 0.2 setosa
## 136 7.7 3.0 6.1 2.3 virginica
## 38 4.9 3.6 1.4 0.1 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 82 5.5 2.4 3.7 1.0 versicolor
## 56 5.7 2.8 4.5 1.3 versicolor
## 71 5.9 3.2 4.8 1.8 versicolor
## 13 4.8 3.0 1.4 0.1 setosa
## 25 4.8 3.4 1.9 0.2 setosa
## 137 6.3 3.4 5.6 2.4 virginica
## 86 6.0 3.4 4.5 1.6 versicolor
## 113 6.8 3.0 5.5 2.1 virginica
## 93 5.8 2.6 4.0 1.2 versicolor
## 145 6.7 3.3 5.7 2.5 virginica
## 74 6.1 2.8 4.7 1.2 versicolor
## 59 6.6 2.9 4.6 1.3 versicolor
## 18 5.1 3.5 1.4 0.3 setosa
## 27 5.0 3.4 1.6 0.4 setosa
## 112 6.4 2.7 5.3 1.9 virginica
## 43 4.4 3.2 1.3 0.2 setosa
## 144 6.8 3.2 5.9 2.3 virginica
## 111 6.5 3.2 5.1 2.0 virginica
## 84 6.0 2.7 5.1 1.6 versicolor
## 33 5.2 4.1 1.5 0.1 setosa
## 85 5.4 3.0 4.5 1.5 versicolor
## 28 5.2 3.5 1.5 0.2 setosa
## 131 7.4 2.8 6.1 1.9 virginica
## 90 5.5 2.5 4.0 1.3 versicolor
## 138 6.4 3.1 5.5 1.8 virginica
## 73 6.3 2.5 4.9 1.5 versicolor
## 30 4.7 3.2 1.6 0.2 setosa
## 1 5.1 3.5 1.4 0.2 setosa
## 124 6.3 2.7 4.9 1.8 virginica
## 110 7.2 3.6 6.1 2.5 virginica
## 101 6.3 3.3 6.0 2.5 virginica
## 16 5.7 4.4 1.5 0.4 setosa
## 95 5.6 2.7 4.2 1.3 versicolor
## 34 5.5 4.2 1.4 0.2 setosa
## 119 7.7 2.6 6.9 2.3 virginica
## 45 5.1 3.8 1.9 0.4 setosa
## 143 5.8 2.7 5.1 1.9 virginica
## 70 5.6 2.5 3.9 1.1 versicolor
## 53 6.9 3.1 4.9 1.5 versicolor
## 62 5.9 3.0 4.2 1.5 versicolor
## 58 4.9 2.4 3.3 1.0 versicolor
## 118 7.7 3.8 6.7 2.2 virginica
## 132 7.9 3.8 6.4 2.0 virginica
## 133 6.4 2.8 5.6 2.2 virginica
## 64 6.1 2.9 4.7 1.4 versicolor
## 120 6.0 2.2 5.0 1.5 virginica
## 121 6.9 3.2 5.7 2.3 virginica
## 135 6.1 2.6 5.6 1.4 virginica
## 69 6.2 2.2 4.5 1.5 versicolor
## 142 6.9 3.1 5.1 2.3 virginica
## 117 6.5 3.0 5.5 1.8 virginica
## 44 5.0 3.5 1.6 0.6 setosa
## 11 5.4 3.7 1.5 0.2 setosa
## 20 5.1 3.8 1.5 0.3 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 66 6.7 3.1 4.4 1.4 versicolor
## 15 5.8 4.0 1.2 0.2 setosa
## 89 5.6 3.0 4.1 1.3 versicolor
## 81 5.5 2.4 3.8 1.1 versicolor
## 105 6.5 3.0 5.8 2.2 virginica
## 55 6.5 2.8 4.6 1.5 versicolor
## 49 5.3 3.7 1.5 0.2 setosa
## 29 5.2 3.4 1.4 0.2 setosa
## 87 6.7 3.1 4.7 1.5 versicolor
## 22 5.1 3.7 1.5 0.4 setosa
## 125 6.7 3.3 5.7 2.1 virginica
## 60 5.2 2.7 3.9 1.4 versicolor
## 126 7.2 3.2 6.0 1.8 virginica
## 107 4.9 2.5 4.5 1.7 virginica
## 24 5.1 3.3 1.7 0.5 setosa
## 63 6.0 2.2 4.0 1.0 versicolor
## 123 7.7 2.8 6.7 2.0 virginica
## 116 6.4 3.2 5.3 2.3 virginica
## 106 7.6 3.0 6.6 2.1 virginica
## 61 5.0 2.0 3.5 1.0 versicolor
## 108 7.3 2.9 6.3 1.8 virginica
## 76 6.6 3.0 4.4 1.4 versicolor
## 94 5.0 2.3 3.3 1.0 versicolor
## 21 5.4 3.4 1.7 0.2 setosa
## 96 5.7 3.0 4.2 1.2 versicolor
## 129 6.4 2.8 5.6 2.1 virginica
## 35 4.9 3.1 1.5 0.2 setosa
## 46 4.8 3.0 1.4 0.3 setosa
## 150 5.9 3.0 5.1 1.8 virginica
## 40 5.1 3.4 1.5 0.2 setosa
## 57 6.3 3.3 4.7 1.6 versicolor
## 114 5.7 2.5 5.0 2.0 virginica
## 146 6.7 3.0 5.2 2.3 virginica
## 9 4.4 2.9 1.4 0.2 setosa
## 78 6.7 3.0 5.0 1.7 versicolor
## 109 6.7 2.5 5.8 1.8 virginica
## 68 5.8 2.7 4.1 1.0 versicolor
## 50 5.0 3.3 1.4 0.2 setosa
## 42 4.5 2.3 1.3 0.3 setosa
## 37 5.5 3.5 1.3 0.2 setosa
test <- iris[-selected,]
SVM classification
# Fit a linear-kernel SVM classifier for Species with 10-fold cross-validation.
# Wrapping the response in as.factor() forces C-classification (Species is
# presumably already a factor in iris, making this a no-op -- confirm).
fitsvml <- svm(as.factor(Species)~., kernel="linear", cross=10, data=train)
summary(fitsvml)
##
## Call:
## svm(formula = as.factor(Species) ~ ., data = train, kernel = "linear",
## cross = 10)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 23
##
## ( 2 11 10 )
##
##
## Number of Classes: 3
##
## Levels:
## setosa versicolor virginica
##
## 10-fold cross-validation on training data:
##
## Total Accuracy: 95.2381
## Single Accuracies:
## 100 90.90909 100 90.90909 100 100 100 100 90 81.81818
# Test-set predictions for the linear-kernel classifier
predsvml <- predict(fitsvml, test)
# Confusion matrix of true vs predicted species (valid here: classification)
confusion=table(test$Species, predsvml)
# Test-set accuracy: proportion of correctly classified observations
sum(diag(confusion))/sum(confusion)
## [1] 0.9777778
# Plot decision boundary on two selected variables (for the linear kernel;
# the original comment said "radial", but this plots fitsvml)
plot(fitsvml, train, Sepal.Length~Sepal.Width, color.palette=terrain.colors)
# Plot decision boundary on two selected variables (for the linear kernel)
plot(fitsvml, train, Petal.Length~Petal.Width, color.palette=terrain.colors)
#plot model
kernel = "polynomial"
# Fit a polynomial-kernel (degree 3, per the summary below) SVM classifier
# with 10-fold cross-validation.
fitsvmp <- svm(as.factor(Species)~., kernel="polynomial", cross=10, data=train)
summary(fitsvmp)
##
## Call:
## svm(formula = as.factor(Species) ~ ., data = train, kernel = "polynomial",
## cross = 10)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 1
## degree: 3
## coef.0: 0
##
## Number of Support Vectors: 44
##
## ( 5 21 18 )
##
##
## Number of Classes: 3
##
## Levels:
## setosa versicolor virginica
##
## 10-fold cross-validation on training data:
##
## Total Accuracy: 88.57143
## Single Accuracies:
## 100 81.81818 90 90.90909 80 100 80 90.90909 90 81.81818
# Test-set predictions for the polynomial-kernel classifier
predsvmp <- predict(fitsvmp, test)
#Confusion matrix
confusion=table(test$Species, predsvmp)
# Test-set accuracy (proportion correctly classified)
sum(diag(confusion))/sum(confusion)
## [1] 0.8666667
# Plot decision boundary on two selected variables (for the polynomial kernel;
# the original comment said "radial", but this plots fitsvmp)
plot(fitsvmp, train, Sepal.Length~Sepal.Width, color.palette=terrain.colors)
# kernel="radial"
# Fit an RBF-kernel SVM classifier with 10-fold cross-validation.
fitsvmr <- svm(as.factor(Species)~., kernel="radial", cross=10, data=train)
summary(fitsvmr)
##
## Call:
## svm(formula = as.factor(Species) ~ ., data = train, kernel = "radial",
## cross = 10)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 43
##
## ( 7 18 18 )
##
##
## Number of Classes: 3
##
## Levels:
## setosa versicolor virginica
##
## 10-fold cross-validation on training data:
##
## Total Accuracy: 96.19048
## Single Accuracies:
## 100 100 90 100 90 90.90909 100 90.90909 100 100
# Test-set predictions and accuracy for the radial-kernel classifier
predsvmr <- predict(fitsvmr, test)
confusion=table(test$Species, predsvmr)
sum(diag(confusion))/sum(confusion)
## [1] 0.9555556
##kernel="sigmoid"
# Fit a sigmoid-kernel SVM classifier with 10-fold cross-validation.
fitsvms <- svm(as.factor(Species)~., kernel="sigmoid", cross=10, data=train)
summary(fitsvms)
##
## Call:
## svm(formula = as.factor(Species) ~ ., data = train, kernel = "sigmoid",
## cross = 10)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: sigmoid
## cost: 1
## coef.0: 0
##
## Number of Support Vectors: 42
##
## ( 5 21 16 )
##
##
## Number of Classes: 3
##
## Levels:
## setosa versicolor virginica
##
## 10-fold cross-validation on training data:
##
## Total Accuracy: 88.57143
## Single Accuracies:
## 100 81.81818 90 81.81818 90 100 90 90.90909 80 81.81818
# Test-set predictions and accuracy for the sigmoid-kernel classifier
predsvms <- predict(fitsvms, test)
confusion=table(test$Species, predsvms)
sum(diag(confusion))/sum(confusion)
## [1] 0.9777778
Conclusion: based on all of the above SVM models, the linear-kernel model is the best, with about 95% cross-validation accuracy and 98% test accuracy (the sigmoid kernel matches its test accuracy but has a much lower cross-validation accuracy of about 89%).
Nonlinear SVMs: the kernel trick
# Load the required library
library(ggplot2)
# Generate a random dataset: points uniform on [-5, 5]^2, labelled -1 inside
# the circle x1^2 + x2^2 = 10 and +1 outside it.
set.seed(123)
n <- 100
x1 <- runif(n, -5, 5)
x2 <- runif(n, -5, 5)
x3 <- x1^2 + x2^2
y <- ifelse(x3 < 10, -1, 1)
df <- data.frame(x1, x2, y)
# Train an SVM with a radial basis function (RBF) kernel
# NOTE(review): svm_model is fitted but never used in this chunk -- the dashed
# circle below is the TRUE class boundary, not the fitted decision boundary.
svm_model <- svm(y ~ ., data = df, kernel = "radial")
# Create a scatter plot in 2D with a dashed circle marking the true boundary.
# NOTE(review): the circle coordinates inside geom_path's aes() are length-100
# vectors that only line up because nrow(df) happens to equal 100; this breaks
# silently if n changes. Prefer passing the circle as its own data frame.
ggplot(df, aes(x = x1, y = x2, color = factor(y))) +
geom_point() +
geom_path(aes(x = sqrt(10)*cos(seq(0,2*pi,length.out=100)),
y = sqrt(10)*sin(seq(0,2*pi,length.out=100))),
linetype = "dashed", color = "black") +
scale_color_manual(values = c("green", "gray")) +
theme_classic()
# Load the required library
# NOTE(review): this chunk duplicates the previous one (same seed, same data,
# same unused svm_model); the only addition is the shaded geom_polygon circle.
library(ggplot2)
# Generate a random dataset (identical to the previous chunk)
set.seed(123)
n <- 100
x1 <- runif(n, -5, 5)
x2 <- runif(n, -5, 5)
x3 <- x1^2 + x2^2
y <- ifelse(x3 < 10, -1, 1)
df <- data.frame(x1, x2, y)
# Train an SVM with a radial basis function (RBF) kernel (fitted but unused here)
svm_model <- svm(y ~ ., data = df, kernel = "radial")
# Create a scatter plot in 2D with a circle; the green translucent polygon
# shades the interior region (the -1 class) bounded by x1^2 + x2^2 = 10.
# NOTE(review): same length-100 aes() recycling caveat as the previous chunk.
ggplot(df, aes(x = x1, y = x2, color = factor(y))) +
geom_point() +
geom_path(aes(x = sqrt(10)*cos(seq(0,2*pi,length.out=100)),
y = sqrt(10)*sin(seq(0,2*pi,length.out=100))),
linetype = "dashed", color = "black") +
geom_polygon(aes(x = sqrt(10)*cos(seq(0,2*pi,length.out=100)),
y = sqrt(10)*sin(seq(0,2*pi,length.out=100))),
fill = "green", alpha = 0.2) +
scale_color_manual(values = c("green", "gray")) +
theme_classic()
# Load the required library
library(e1071)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Generate a random dataset (same construction as the 2-D chunks, but x3 is
# kept as a column so the lifted 3-D representation can be plotted).
set.seed(123)
n <- 100
x1 <- runif(n, -5, 5)
x2 <- runif(n, -5, 5)
x3 <- x1^2 + x2^2
y <- ifelse(x3 < 10, -1, 1)
df <- data.frame(x1, x2, x3, y)
# Train an SVM with a radial basis function (RBF) kernel
# NOTE(review): as in the 2-D chunks, svm_model is fitted but not used below.
svm_model <- svm(y ~ ., data = df, kernel = "radial")
# 3-D scatter plot: after the lift (x1, x2) -> (x1, x2, x1^2 + x2^2) the two
# classes sit below/above z = 10, so they are separable by a horizontal plane;
# the black circle at z = 0 marks the original 2-D boundary.
# NOTE(review): passing colour names ("green"/"gray") as factor LABELS does not
# set the marker colours -- it only names the legend entries, and triggers the
# brewer.pal warnings below. Map colours explicitly if green/gray was intended.
fig <- plot_ly(df, x = ~x1, y = ~x2, z = ~x3, color = factor(y, levels = c(-1,1), labels = c("green", "gray")), type = "scatter3d") %>%
add_trace(x = cos(seq(0, 2*pi, length.out = 100))*sqrt(10),
y = sin(seq(0, 2*pi, length.out = 100))*sqrt(10),
z = 0,
type = "scatter3d", mode = "lines", line = list(color = "black"))
fig
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
SVM for Regression
library(e1071)
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.8-42. For overview type 'help("mgcv-package")'.
# Simulation parameters for the 1-D support-vector-regression demo
N <- 100          # number of observations
PI <- 3.1415      # NOTE(review): base R provides `pi`; this truncated copy is slightly off
domain <- 4*PI    # X values span [0, ~4*pi], i.e. two full sine periods
nf <- 5           # noise amplitude factor
outliers <- 97
f <- function(x) sin(x)
set.seed(123)
# Build a noisy sample of fun() over [0, domain], with some points perturbed.
generate_dataset <- function(fun) {
# Generate X values: N sorted uniform draws over the domain
X <- sort(domain * runif(N), decreasing = FALSE)
# Generate Y values: fun(X) plus uniform noise in [-nf*0.05, nf*0.05]
Y <- fun(X) + nf * 0.1 * (0.5 - runif(N))
# NOTE(review): with N = 100, outliers = 97 we get N - outliers = 3, so the
# logical index (1:3 %% 3 == 0) = (F, F, T) is RECYCLED along Y's 100
# elements -- every 3rd observation (rows 3, 6, ..., 99; 33 points) is
# selected, and sum(...) = 1 means they are all shifted by ONE shared random
# offset. The name `outliers` suggests something else was intended; confirm.
Y[c(1:(N-outliers)) %% (N-outliers) == 0] <- Y[c(1:(N-outliers)) %% (N-outliers) == 0] + nf * (0.5 - runif(sum(c(1:(N-outliers)) %% (N-outliers) == 0)))
return(data.frame(X, Y))
}
dataset <- generate_dataset(f)
# Fit eps-regression SVMs with progressively wider epsilon-insensitive tubes
# (identical calls apart from epsilon), then compute in-sample predictions.
# predict() only needs the X column, so the full dataset can be passed as newdata.
eps_fit <- function(eps) svm(Y ~ X, data = dataset, epsilon = eps)
fit <- eps_fit(0.01)
pred <- predict(fit, dataset)
fit1 <- eps_fit(0.1)
pred1 <- predict(fit1, dataset)
fit2 <- eps_fit(1.0)
pred2 <- predict(fit2, dataset)
fit3 <- eps_fit(1.5)
pred3 <- predict(fit3, dataset)
# Plot the noisy data, the true sine curve, and the four SVR fits.
plot(dataset$X, dataset$Y, col = "black", pch = 21, bg = "black",
     xlab = "X", ylab = "Y", main = "Support Vector Regression")
lines(dataset$X, f(dataset$X), col = "blue", lwd = 2)
lines(dataset$X, pred, col = "brown", lwd = 2)
lines(dataset$X, pred1, col = "sky blue", lwd = 2)
lines(dataset$X, pred2, col = "orange", lwd = 2)
lines(dataset$X, pred3, col = "red", lwd = 2)
# BUG FIX: the legend previously supplied 7 colours for 6 labels, so every
# entry after "True" was paired with the wrong colour; it also listed "green"
# (a colour never drawn) and labelled the epsilon = 1.5 fit as "epsilon 2.0".
# Labels and colours now match the lines drawn above one-to-one.
legend("topright",
       legend = c("Data", "True", "epsilon 0.01", "epsilon 0.1",
                  "epsilon 1.0", "epsilon 1.5"),
       col = c("black", "blue", "brown", "sky blue", "orange", "red"),
       lty = 1, lwd = 2, cex = 1.2)
# Explicit feature map phi(x1, x2) = (x1, x2, x1^2 + x2^2): lifts 2-D points
# into 3-D so that points inside/outside a circle become linearly separable.
# Input: a matrix with at least two columns; returns an n x 3 matrix.
feature_map_1 <- function(X) {
  first <- X[, 1]
  second <- X[, 2]
  cbind(first, second, first^2 + second^2, deparse.level = 0)
}
# Generate dataset and feature map
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
##
## select
set.seed(123)
# 100 draws from a correlated bivariate normal (unit variances, covariance 0.5)
data <- data.frame(mvrnorm(100, mu = c(0, 0), Sigma = matrix(c(1, 0.5, 0.5, 1), nrow = 2)))
X <- as.matrix(data)
# Label points by whether they fall inside the circle x1^2 + x2^2 = 0.7
y <- ifelse(X[,1]^2 + X[,2]^2 < 0.7, 0, 1)
# Lift the points into 3-D via the explicit feature map defined above
Z <- feature_map_1(X)
# 2D scatter plot of the original (not linearly separable) data
plot(X[,1], X[,2], col = ifelse(y == 1, "red", "blue"), pch = 16, xlab = "x1", ylab = "x2", main = "Original dataset")
# 3D scatter plot of the lifted data, where a plane at z3 = 0.7 separates the classes
library(scatterplot3d)
s3d <- scatterplot3d(Z, color = ifelse(y == 1, "red", "blue"), pch = 16, xlab = "z1", ylab = "z2", zlab = "z3", main = "Transformed dataset")